library(randomForest)
library(plyr)

# ----- reproducibility: fix the RNG seed before anything else runs
set.seed(seed = 333)

# ----- inputs and outputs

# --- destinations for the predictions table and the workspace image
outfile_path <- "A:\\cmac\\Lauren Connor\\Artemisinin ML\\20180802_ART_testset_CO1\\ART_as_COFORMER1_results.txt"
workspace_img_path <- "A:\\cmac\\Lauren Connor\\Artemisinin ML\\20180802_ART_testset_CO1\\ART_as_COFORMER1_results.RData"

# --- tab-delimited input tables, first row read as the header
masterSet <- read.delim("A:\\cmac\\Lauren Connor\\Artemisinin ML\\20180802_dataset_corr100.txt")
masterTestSet <- read.delim("A:\\cmac\\Lauren Connor\\Artemisinin ML\\20180802_ART_testset_CO1\\ART_as_COFORMER1_dataset.txt")

# working copies that the rest of the script trims down; the master* objects
# stay exactly as loaded
testSet <- masterTestSet
trainingSet <- masterSet

# --- Split the response off the training table
trainingResponse <- trainingSet$OUTPUT
# testResponse <- testSet$OUTPUT     # the test table has no OUTPUT column
trainingSet[["OUTPUT"]] <- NULL

# With OUTPUT gone, the training column names define the predictor layout.
# Re-index the test table by those names: same columns, same order, and any
# column that only exists in the test table is dropped.
testSet <- testSet[, names(trainingSet)]

# --- Columns to "keep aside": wanted later in the results, but not fed to RF.
# These are the first five columns of each table (the response has already
# been removed from the training table, so the positions line up).

# training table: stash, then strip
tr_exc_cols <- trainingSet[, seq_len(5)]
trainingSet[, seq_len(5)] <- NULL

# test table: stash, then strip
te_exc_cols <- testSet[, seq_len(5)]
testSet[, seq_len(5)] <- NULL

# --- Random Forest time!
#
# randomForest() has no `type` or `log` argument: the `type=classification`
# and `log="y"` that used to be passed here were swallowed by `...` and had no
# effect. Whether a classification or a regression forest is grown depends
# only on the class of the response, so the response is coerced to a factor
# explicitly. That is a no-op when OUTPUT was already read as a factor, and it
# makes the class-probability prediction below valid when OUTPUT was read as
# numeric or character instead.

# A missing OUTPUT column leaves the response NULL, in which case
# randomForest() would silently fit an unsupervised forest. Fail loudly.
if (length(trainingResponse) != nrow(trainingSet)) {
  stop(
    "Training response is missing or its length does not match the ",
    "training set; check that the training file has an OUTPUT column.",
    call. = FALSE
  )
}

data.rf <- randomForest(
  x = trainingSet,
  y = as.factor(trainingResponse),
  ntree = 1000,
  replace = TRUE,
  keep.forest = TRUE,   # the forest is needed by predict() below
  importance = TRUE,
  proximity = TRUE
)

# Predicted class and per-class probabilities for every test row
data.pred <- predict(data.rf, testSet)
data.probpred <- predict(data.rf, testSet, type = "prob")

# Set-aside test columns first, then the predicted class, then one
# probability column per class
results <- data.frame(te_exc_cols, data.pred, data.probpred)


# Predictions as a tab-separated text file: header row kept, no row names,
# no quoting of values
write.table(
  x = results,
  file = outfile_path,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Snapshot of the whole workspace (data, fitted forest, predictions)
save.image(file = workspace_img_path)


